#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# ------------------------------------        0. Max Joosten         ------------------------------------ #
#-------------------------------------   Last updated: 08.09.2023    -------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 

# 0. Load relevant packages
library(tidyverse)   # Collection of useful packages (mostly dplyr)
library(haven)       # For reading foreign data (read_stata)
library(diagis)      # For weighted standard error 
library(expss)       # For adding labels to variables

# 0. Set working directory
setwd("your path name")

#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# ----------------------------------        1. COB Preparation         ---------------------------------- #
#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# Combining COB waves
files <- list.files(path = getwd())
tmp <- map(files, read_stata)

# Lower case all columnnames
for(i in seq_along(1:length(tmp))){
  colnames(tmp[[i]]) <- tmp[[i]] %>% (colnames) %>% str_to_lower()
}

# Add weight to surveywaves that do not include this variable
for(i in seq_along(1:length(tmp))){
  if (!("wt" %in% names(tmp[[i]]))) tmp[[i]] <- tmp[[i]] %>% add_column(wt=1)
}

# Select variables by creating function to loop over all datasets
extractColumns <- function(x){
  select(x,"geslacht", "leeftijd", "inkomenp", "opleidin", "vc029", "vd012a", "vd012b", "vd012a", "vd012b", "vd012c", "vd012d",
         "vd012e", "vd012f", "vd012g", "vd012h", "vd012i", "vd012j", "vd012k", "vd012l", 
         "vd012m", "vd012n", "vd012o", "vd012p", "vd012q", "meting", "l_dagnr", "l_maand", "l_jaar",
         "wt")
}

# Apply function to all datasets and bind rows
df <- map(tmp,extractColumns) %>% bind_rows()

# Rename column names
df <- dplyr::rename(df,
                    inc = inkomenp,
                    edu = opleidin,
                    party = vc029,
                    pref1 = vd012a,
                    pref2 = vd012b,
                    pref8 = vd012c,
                    pref3 = vd012d,
                    pref_pov = vd012e,
                    pref_emp = vd012f,
                    pref4 = vd012h,
                    pref5 = vd012i,
                    pref6 = vd012l,
                    pref_safety = vd012n,
                    pref_ter = vd012o,
                    wave = meting)

# Because wave 38 is collected before the realization of the coalition agreement, I exclude this and take take wave 40 as the first wave after the election.
# To get the right lags, I already exclude this wave here.
df <- df %>% filter(!wave %in% c(38))

# Recode spending categories as numeric
for(i in df %>% select(starts_with("pref")) %>% colnames()){
  df[[i]] <- as.numeric(df[[i]])
}

# Recode spending preferences so they range from -1 to +1
df <- df %>%
  mutate(across(starts_with('pref'), 
                ~dplyr::recode(., `1` = -1, `2` = -0.5, `3` = 0, 
                               `4` = 0.5, `5` = 1, `6` = NA_real_)))

# Recode two variables that correspond to a single party position (CPB) item
df <- df %>%
  mutate(pref7 = (pref_safety + pref_ter) / 2,
         pref9 = (pref_pov + pref_emp) / 2)

# Recode education into three groups
df <- df %>%
  mutate(edu = case_when(edu %in% c(1,2) ~ 3,
                         edu %in% c(3,4) ~ 2,
                         edu %in% c(5,6,7) ~ 1))

# Recode income (this drops missing variables)
df <- df %>%
  mutate(inc = case_when(inc %in% 1 ~ 1,
                         inc %in% 2 ~ 2,
                         inc %in% 3 ~ 3))

# Recode parties and drop parties that can't be matched to CPB
df <- df %>%
  mutate(party = case_when(
    party == 1  ~ 1, 
    party == 2  ~ 2,
    party == 3  ~ 3,
    party == 4  ~ 4,
    party == 6  ~ 5,
    party == 7  ~ 6,
    party == 8  ~ 7,
    party == 10  ~ 8,
    party == 5 ~ 9,
    party == 97 ~ 10)) 

# Keep only respondents with party and education variable, and party and income variable
df_edu <- df %>% 
  filter(!is.na(party) & !is.na(edu))

df_inc <- df %>% 
  filter(!is.na(party) & !is.na(inc))

# Reorder and keep relevant variables
df_edu <- df_edu[, c("wave", "party", "edu", "pref1", "pref2", "pref3", "pref4", "pref5", "pref6", "pref7", "pref8", "pref9", "wt")]
df_inc <- df_inc[, c("wave", "party", "inc", "pref1", "pref2", "pref3", "pref4", "pref5", "pref6", "pref7", "pref8", "pref9", "wt")]

# Reshape the data so that each respondent-policy area is an observation for education groups and income groups
df_edu <- df_edu %>%
  pivot_longer(!c("wave", "party", "edu", "wt"), names_to = "area", values_to = "value")

df_inc <- df_inc %>%
  pivot_longer(!c("wave", "party", "inc", "wt"), names_to = "area", values_to = "value")

# Recode for merging later on
df_edu <- df_edu %>%
  mutate(area = case_when(
    area == "pref1" ~ 1, 
    area == "pref2" ~ 2,
    area == "pref3" ~ 3,
    area == "pref4" ~ 4,
    area == "pref5" ~ 5,
    area == "pref6" ~ 6,
    area == "pref7" ~ 7,
    area == "pref8" ~ 8,
    area == "pref9" ~ 9))

df_inc <- df_inc %>%
  mutate(area = case_when(
    area == "pref1" ~ 1, 
    area == "pref2" ~ 2,
    area == "pref3" ~ 3,
    area == "pref4" ~ 4,
    area == "pref5" ~ 5,
    area == "pref6" ~ 6,
    area == "pref7" ~ 7,
    area == "pref8" ~ 8,
    area == "pref9" ~ 9))

# Collapsing dataset to group means
df_edu <- df_edu %>% dplyr::group_by(wave, party, area, edu) %>% 
  dplyr::summarise(epref = weighted.mean(value, wt, na.rm = T),
                   ewt = weighted_se(value, wt, na.rm = T)) %>% ungroup()

df_inc <- df_inc %>% dplyr::group_by(wave, party, area, inc) %>% 
  dplyr::summarise(ipref = weighted.mean(value, wt, na.rm = T),
                   iwt = weighted_se(value, wt, na.rm = T)) %>% ungroup()

# New weights are the inverse of the standard error around the group mean
df_edu <- df_edu %>% mutate(ewt = 1/ewt)
df_inc <- df_inc %>% mutate(iwt = 1/iwt)

df_edu$ewt[df_edu$ewt %in% Inf] <- NA
df_inc$iwt[df_inc$iwt %in% Inf] <- NA

df_inc <- df_inc %>% 
  mutate(iwt = iwt/mean(iwt, na.rm = T))

df_edu <- df_edu %>% 
  mutate(ewt = ewt/mean(ewt, na.rm = T))

# Reshape again so that there are separate variables for group preferences
df_edu <- df_edu %>%
  pivot_wider(names_from = c("edu"), values_from = c("epref", "ewt"))

df_inc <- df_inc %>%
  pivot_wider(names_from = c("inc"), values_from = c("ipref", "iwt"))

# Give weight of 1 when weight is missing
df_edu$ewt_1[df_edu$ewt_1 %in% NA] <- 1
df_edu$ewt_2[df_edu$ewt_2 %in% NA] <- 1
df_edu$ewt_3[df_edu$ewt_3 %in% NA] <- 1

df_inc$iwt_1[df_inc$iwt_1 %in% NA] <- 1
df_inc$iwt_2[df_inc$iwt_2 %in% NA] <- 1
df_inc$iwt_3[df_inc$iwt_3 %in% NA] <- 1

# Weights for models with multiple education groups
df_edu <- df_edu %>% 
  mutate(ewt123 = (ewt_1 + ewt_2 + ewt_3) / 3,
         ewt12 = (ewt_1 + ewt_2) / 2,
         ewt13 = (ewt_1 + ewt_3) / 2,
         ewt23 = (ewt_2 + ewt_3) / 2)

df_inc <- df_inc %>% 
  mutate(iwt123 = (iwt_1 + iwt_2 + iwt_3) / 3,
         iwt12 = (iwt_1 + iwt_2) / 2,
         iwt13 = (iwt_1 + iwt_3) / 2,
         iwt23 = (iwt_2 + iwt_3) / 2)

# Important: there are missing values for higher educated SGP voters in wave 8. I replace this with the mean of the wave before and after (wave 4 and wave 12).
for(i in 1:9){
  df_edu$epref_3[df_edu$party %in% 8 & df_edu$wave %in% 8 & df_edu$area %in% i] <- mean(subset(df_edu, party %in% 8 & (wave %in% 4 | wave %in% 12) & area %in% i)$epref_3)
}

# Lag and lead values
df_edu <- df_edu %>% group_by(party, area) %>%
  mutate(l_epref_1 = dplyr::lag(epref_1, 1, order_by = wave, na.rm =T),          # 1 lag (2010-II)
         l_epref_2 = dplyr::lag(epref_2, 1, order_by = wave, na.rm =T),          # 1 lag (2012-II)
         l_epref_3 = dplyr::lag(epref_3, 1, order_by = wave, na.rm =T),          # 1 lag (2017-II)
         
         l2_epref_1 = dplyr::lag(epref_1, 2, order_by = wave, na.rm =T),         # 2 lags
         l2_epref_2 = dplyr::lag(epref_2, 2, order_by = wave, na.rm =T),         # 2 lags
         l2_epref_3 = dplyr::lag(epref_3, 2, order_by = wave, na.rm =T),         # 2 lags
         
         l3_epref_1 = dplyr::lag(epref_1, 3, order_by = wave, na.rm =T),         # 3 lags
         l3_epref_2 = dplyr::lag(epref_2, 3, order_by = wave, na.rm =T),         # 3 lags
         l3_epref_3 = dplyr::lag(epref_3, 3, order_by = wave, na.rm =T),         # 3 lags
         
         lead_epref_1 = dplyr::lead(epref_1, 1, order_by = wave, na.rm =T),      # lead
         lead_epref_2 = dplyr::lead(epref_2, 1, order_by = wave, na.rm =T),      # lead
         lead_epref_3 = dplyr::lead(epref_3, 1, order_by = wave, na.rm =T),      # lead
         
         lead2_epref_1 = dplyr::lead(epref_1, 2, order_by = wave, na.rm =T),     # 2 leads
         lead2_epref_2 = dplyr::lead(epref_2, 2, order_by = wave, na.rm =T),     # 2 leads
         lead2_epref_3 = dplyr::lead(epref_3, 2, order_by = wave, na.rm =T))     # 2 leads


df_inc <- df_inc %>% group_by(party, area) %>%
  mutate(l_ipref_1 = dplyr::lag(ipref_1, 1, order_by = wave, na.rm =T),
         l_ipref_2 = dplyr::lag(ipref_2, 1, order_by = wave, na.rm =T),
         l_ipref_3 = dplyr::lag(ipref_3, 1, order_by = wave, na.rm =T),
         
         l2_ipref_1 = dplyr::lag(ipref_1, 2, order_by = wave, na.rm =T),
         l2_ipref_2 = dplyr::lag(ipref_2, 2, order_by = wave, na.rm =T),
         l2_ipref_3 = dplyr::lag(ipref_3, 2, order_by = wave, na.rm =T),
         
         l3_ipref_1 = dplyr::lag(ipref_1, 3, order_by = wave, na.rm =T),
         l3_ipref_2 = dplyr::lag(ipref_2, 3, order_by = wave, na.rm =T),
         l3_ipref_3 = dplyr::lag(ipref_3, 3, order_by = wave, na.rm =T),
         
         lead_ipref_1 = dplyr::lead(ipref_1, 1, order_by = wave, na.rm =T),
         lead_ipref_2 = dplyr::lead(ipref_2, 1, order_by = wave, na.rm =T),
         lead_ipref_3 = dplyr::lead(ipref_3, 1, order_by = wave, na.rm =T),   
         
         lead2_ipref_1 = dplyr::lead(ipref_1, 2, order_by = wave, na.rm =T),
         lead2_ipref_2 = dplyr::lead(ipref_2, 2, order_by = wave, na.rm =T),
         lead2_ipref_3 = dplyr::lead(ipref_3, 2, order_by = wave, na.rm =T))


# Only keeping relevant waves
df_edu <- df_edu %>% 
  filter(wave %in% c(12, 20, 40))

df_inc <- df_inc %>% 
  filter(wave %in% c(12, 20, 40))

# Creating year variable
df_edu <- df_edu %>%
  mutate(year = case_when(wave %in% 12 ~ 2010,          # 2010-IV
                          wave %in% 20 ~ 2012,          # 2012-IV
                          wave %in% 40 ~ 2017))         # 2017-IV

# Join data with survey data and keep only the dataframe
df <- left_join(df_edu, df_inc, by = c("wave", "area", "party"))

rm(list=setdiff(ls(), c("df")))

#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# --------------------------------          2. CPB Preparation          --------------------------------- #
#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
cpb <- read_stata("../cpb/cpb.dta") %>% filter(year %in% c(2006:2017))

# Rename colum nnames
cpb <- rename(cpb, 
              pp1 = budgetaryimprovements,
              pp2 = spend_milieu,
              pp3 = spend_internationalesamenwerkng,
              pp4 = spend_onderwijs,
              pp5 = spend_bereikbaarheid,
              pp6 = spend_zorg,
              pp7 = spend_veiligheid,
              pp8 = spend_defensie,
              pp9 = spend_socialezekerheid)

# Select only matching data
cpb <- cpb %>% select(party, year, starts_with("pp"))

# Order variables
cpb <- cpb[, c("party", "year", "pp1", "pp2", "pp3", "pp4", "pp5", "pp6", "pp7", "pp8", "pp9")]

# Drop parties and years that can't be matched to COB
cpb <- cpb %>% filter(!party %in% c("DENK", "50PLUS") & !year %in% c(2002, 2007, 2021))

# Reshape the data so that each respondent-policy area is an observation
cpb <- cpb %>% 
  pivot_longer(!c(year, party), names_to = c("var", "area"), names_pattern = "(.*)(\\d)") %>% 
  group_by(year, party, area, var) %>% 
  mutate(id = row_number()) %>% 
  ungroup() %>% 
  pivot_wider(names_from = var, values_from = value) %>% select(!id)

# For merging 
cpb$area <- as.numeric(cpb$area)

# Party is still a string variable, recode to numeric to merge with survey data
cpb <- cpb %>%
  mutate(party = case_when(
    party == "CDA" ~ 1, 
    party == "PvdA" ~ 2,
    party == "SP" ~ 3,
    party == "VVD" ~ 4,
    party == "GL" ~ 5,
    party == "CU" ~ 6,
    party == "D66" ~ 7,
    party == "SGP" ~ 8,
    party == "PVV" ~ 9)) 

# Divide by 10 to facilitate interpretation of the results
cpb$pp <- cpb$pp / 10

# Merge data
df <- left_join(df, cpb, by = c("party", "year", "area"))

#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# --------------------------------          3. CBS Preparation          --------------------------------- #
#---------------------------------------------------------------------------------------------------------# 
#---------------------------------------------------------------------------------------------------------# 
# Load data, select relevant variables and rename
cbs <- read.csv("../Overheidsuitgaven__functies_20102021_101316.csv", sep = ";") %>% 
  select(Overheidsfuncties, Perioden, Overheidsuitgaven.en.bestedingen..mln.euro.) %>%
  rename(area = Overheidsfuncties,
         year = Perioden,
         gov_expenditure = Overheidsuitgaven.en.bestedingen..mln.euro.)

# Recode area names so that they match
cbs <- cbs %>%
  mutate(area = case_when(
    area == "Totaal" ~ 99,
    area == "1.7 Verrichtingen overheidsschuld" ~ 1, 
    area == "5. Milieubescherming" ~ 2,
    area == "1.2 Hulp buitenland" ~ 3,
    area == "9. Onderwijs" ~ 4,
    area == "4.5 Vervoer" ~ 5,
    area == "7. Volksgezondheid" ~ 6,
    area == "3. Openbare orde en veiligheid" ~ 7,
    area == "2. Landsverdediging" ~ 8,
    area == "10. Sociale bescherming" ~ 9)) %>% 
  filter(year %in% c(2006,2010,2012,2017) & !area %in% c(99))

cbs <- cbs %>% group_by(area) %>% 
  mutate(spendlag = gov_expenditure - (gov_expenditure - lag(gov_expenditure, order_by = year))) %>% 
  mutate(spendchange = (gov_expenditure - spendlag) / spendlag)

cbs <- cbs %>% select(year, area, spendchange) %>% filter(year %in% c(2010,2012,2017))

# Merge with voter and party data
df <- left_join(df, cbs, by = c("year", "area"))

# Cleaning global environment
rm(list=setdiff(ls(), c("df")))

# Remove PVV 2017, no party position variable (36 observations)
df <- df %>% 
  filter(!is.na(pp))

# Remove budget balance (26 observations)
df <- df %>% filter(!area %in% 1)

# Add labels and save
# Label variables
df <- apply_labels(df,
                   wave = "Survey wave",
                   party = "Party",
                   party = c("CDA" = 1,
                             "PvdA" = 2,
                             "SP" = 3,
                             "VVD" = 4,
                             "GL" = 5,
                             "CU" = 6,
                             "D66" = 7,
                             "SGP" = 8,
                             "PVV" = 9),
                   area = "Spending area",
                   area = c("Environment" = 2,
                            "Development" = 3,
                            "Education" = 4,
                            "Transport" = 5,
                            "Health care" = 6,
                            "Crime" = 7,
                            "Defense" = 8,
                            "Social security" = 9),
                   epref_1 = "Low educated preferences",
                   epref_2 = "Middle educated preferences",
                   epref_3 = "High educated preferences",
                   ewt_1 = "Weight low educated",
                   ewt_2 = "Weight middle educated",
                   ewt_3 = "weight high educated",
                   ewt123 = "Weight all",
                   ewt12 = "Weight low + middle",
                   ewt13 = "Weight low + high",
                   ewt23 = "Weight middle + high",
                   
                   l_epref_1 = "Lagged low educated preferences",
                   l_epref_2 = "Lagged middle educated preferences",
                   l_epref_3 = "Lagged high educated preferences",
                   l2_epref_1 = "Lagged (t-2) low educated preferences",
                   l2_epref_2 = "Lagged (t-2) middle educated preferences",
                   l2_epref_3 = "Lagged (t-2) high educated preferences",
                   l3_epref_1 = "Lagged (t-3) low educated preferences",
                   l3_epref_2 = "Lagged (t-3) middle educated preferences",
                   l3_epref_3 = "Lagged (t-3) high educated preferences",
                   lead_epref_1 = "Lead low educated preferences",
                   lead_epref_2 = "Lead middle educated preferences",
                   lead_epref_3 = "Lead high educated preferences",
                   lead2_epref_1 = "Lead (t+2) low educated preferences",
                   lead2_epref_2 = "Lead (t+2) middle educated preferences",
                   lead2_epref_3 = "Lead (t+2) high educated preferences",
                   
                   ipref_1 = "Low income preferences",
                   ipref_2 = "Middle income preferences",
                   ipref_3 = "High income preferences",
                   
                   iwt_1 = "Weight low income",
                   iwt_2 = "Weight middle income",
                   iwt_3 = "weight high income",
                   iwt123 = "Weight all income",
                   iwt12 = "Weight low + middle income",
                   iwt13 = "Weight low + high income",
                   iwt23 = "Weight middle + high income",
                   
                   l_ipref_1 = "Lagged low income preferences",
                   l_ipref_2 = "Lagged middle income preferences",
                   l_ipref_3 = "Lagged high income preferences",
                   l2_ipref_1 = "Lagged (t-2) low income preferences",
                   l2_ipref_2 = "Lagged (t-2) middle income preferences",
                   l2_ipref_3 = "Lagged (t-2) high income preferences",
                   l3_ipref_1 = "Lagged (t-3) low income preferences",
                   l3_ipref_2 = "Lagged (t-3) middle income preferences",
                   l3_ipref_3 = "Lagged (t-3) high income preferences",
                   lead_ipref_1 = "Lead low income preferences",
                   lead_ipref_2 = "Lead middle income preferences",
                   lead_ipref_3 = "Lead high income preferences",
                   lead2_ipref_1 = "Lead (t+2) low income preferences",
                   lead2_ipref_2 = "Lead (t+2) middle income preferences",
                   lead2_ipref_3 = "Lead (t+2) high income preferences",
                   
                   year = "Year",             
                   pp = "Party preferences",                    
                   spendchange = "Relative change in absolute spending, per spending area")

# Export data
write_dta(df, ("../df_complete.dta"))
